library(RColorBrewer)
library(ISLR)
library(MASS)
library(rpart)
library(rpart.plot)
library(ggplot2)
theme_set(theme_bw(base_size = 14))
cols <- brewer.pal(9, "Set1")

Trees

One dimensional \(X\)

# Load the data
data("Boston")


# Split the data in training and test
n <- nrow(Boston)
n_train <- floor(0.8 * n)
n_test <- n - n_train

set.seed(123)
idx_test <- sample(1:n, n_test, replace = FALSE)
Boston_ts <- Boston[idx_test,]
Boston <- Boston[-idx_test,]

# Create grid where we make prediction
xgrid <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 1000)


# Fit a deep tree
deep_tree <- rpart(medv ~ lstat, method = "anova", data = Boston,
                   control = rpart.control(minsplit = 5, cp = .0005))
cat('Size of deep tree:', length(unique(deep_tree$where)), '\n')
Size of deep tree: 78 
# Plot the cv error curve for the tree
par(mar=c(4,4,4,2), family = 'serif')
plotcp(deep_tree)

# Pick the best Cp
best_alpha <- deep_tree$cptable[which.min(deep_tree$cptable[,"xerror"]),"CP"]
cat('Best alpha:', best_alpha, '\n')
Best alpha: 0.00607656 
# Prune the tree down to that Cp
best_tree <- prune(deep_tree, cp = best_alpha)
cat('Size of best tree:', length(unique(best_tree$where)), '\n')
Size of best tree: 7 
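
A common alternative to simply minimizing the cross-validated error is the 1-SE rule: keep the simplest subtree whose xerror lies within one standard error of the minimum. A minimal sketch using the cptable above (the names thr, alpha_1se and tree_1se are mine; the rest of the analysis keeps using best_tree):

# 1-SE rule (sketch): simplest subtree within one xstd of the minimum xerror
cptab <- deep_tree$cptable
thr <- min(cptab[, "xerror"]) + cptab[which.min(cptab[, "xerror"]), "xstd"]
alpha_1se <- cptab[cptab[, "xerror"] <= thr, "CP"][1]  # cptable is ordered by decreasing CP
tree_1se <- prune(deep_tree, cp = alpha_1se)
length(unique(tree_1se$where))   # size of the 1-SE tree
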
# Show the optimal tree
rpart.plot(best_tree, clip.right.labs = FALSE, under = TRUE, digits = 4)

best_tree
n= 404 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 404 35629.5400 22.72426  
   2) lstat>=5.44 340 14113.0300 19.93265  
     4) lstat>=14.395 145  3091.8700 15.13103  
       8) lstat>=19.83 64  1039.8170 12.45625 *
       9) lstat< 19.83 81  1232.3800 17.24444 *
     5) lstat< 14.395 195  5192.2580 23.50308  
      10) lstat>=9.725 90   658.9840 21.04667 *
      11) lstat< 9.725 105  3524.7420 25.60857 *
   3) lstat< 5.44 64  4790.5990 37.55469  
     6) lstat>=4.475 30  1309.4700 32.49667 *
     7) lstat< 4.475 34  2036.4090 42.01765  
      14) lstat>=3.425 18  1207.3360 39.17222 *
      15) lstat< 3.425 16   519.3844 45.21875 *
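
The yval shown at each node is simply the average of medv over the training observations falling in that region. For example, terminal node 8 (lstat >= 19.83, which implies both parent splits) should reproduce its printed value:

# Terminal-node prediction = training mean of the response in that region
mean(Boston$medv[Boston$lstat >= 19.83])   # matches the 12.45625 printed for node 8
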
pred <- as.numeric(predict(best_tree, newdata = list(lstat = xgrid)))

par(mar=c(4,4,2,2), family = 'serif')
plot(Boston$lstat, Boston$medv, pch = 16, xlab = 'lstat', ylab = 'medv')
lines(xgrid, pred, lwd = 2, col = cols[1])

Show suboptimal trees

alpha_vec <- c(0.1, best_alpha, 3E-3)
par(mfrow=c(3,2), mar = c(2,2,2,2), family = 'serif')
for(i in 1:3){
  plot(Boston$lstat, Boston$medv, pch = 16, cex = .5, 
       xlab = 'lstat', ylab = 'medv')
  ptree <- prune(deep_tree, cp = alpha_vec[i])
  pred <- as.numeric(predict(ptree, newdata = list(lstat = xgrid)))
  lines(xgrid, pred, col = 'red', lwd = 2)
  title(paste('alpha =',round(alpha_vec[i],4)))
  rpart.plot(ptree, clip.right.labs = FALSE, under = TRUE, digits = 4)
}

# Prediction on the test set
pred_y <- as.numeric(predict(best_tree, newdata = Boston_ts))
err_1D <- (pred_y - Boston_ts$medv)^2

plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')

Two dimensional \(X\)

n_lev <- 1000
cols_cont <- colorRampPalette(cols[c(2,3,6,1)])(n_lev)
levels <- seq(min(Boston$medv), max(Boston$medv), length.out = n_lev)

par(mar=c(4,4,2,2), family = 'serif')
plot(Boston$lstat, Boston$dis, pch = 16, xlab = 'lstat', ylab = 'dis', col = cols_cont[cut(Boston$medv, n_lev)])

best_tree <- rpart(medv ~ lstat + dis, method = "anova", data = Boston)
cat('Size of optimal tree:', length(unique(best_tree$where)), '\n')
Size of optimal tree: 8 
# Plot the cv error curve for the tree
par(mar=c(4,4,4,2), family = 'serif')
plotcp(best_tree)

# Show the optimal tree
rpart.plot(best_tree, clip.right.labs = FALSE, under = TRUE, digits = 4)

# Make perspective plot
x1_grid <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 100)
x2_grid <- seq(min(Boston$dis), max(Boston$dis), length.out = 100)
X_pred <- expand.grid(x1_grid, x2_grid)
X_pred <- data.frame(lstat = X_pred[,1], dis = X_pred[,2])

pred <- predict(best_tree, newdata = X_pred)

par(mar=c(1,1,1,1), family = 'serif')
persp(x1_grid, x2_grid, matrix(pred, nrow = length(x1_grid)),
      theta = 150, xlab = 'lstat', ylab = 'dis', zlab = 'medv',
      zlim = range(Boston$medv), lwd = 0.2)
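
Viewed from above, the same fit is a set of axis-aligned rectangles, one constant prediction per leaf. A quick sketch reusing the colour ramp defined earlier (my addition, not part of the original analysis):

# Top-down view of the fitted surface: each leaf is a rectangle in (lstat, dis)
image(x1_grid, x2_grid, matrix(pred, nrow = length(x1_grid)),
      col = cols_cont, xlab = 'lstat', ylab = 'dis')
points(Boston$lstat, Boston$dis, pch = 16, cex = .4)
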

# Prediction on the test set
pred_y <- as.numeric(predict(best_tree, newdata = Boston_ts))
err_2D <- (pred_y - Boston_ts$medv)^2

plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')

sqrt(mean(err_1D)); sqrt(mean(err_2D))
[1] 6.022948
[1] 5.147602

Using all the covariates

best_tree <- rpart(medv ~ ., method = "anova", data = Boston)
cat('Size of big tree:', length(unique(best_tree$where)), '\n')
Size of big tree: 8 
# Plot the cv error curve for the tree
par(mar=c(4,4,4,2), family = 'serif')
plotcp(best_tree)

# Show the optimal tree
rpart.plot(best_tree, clip.right.labs = FALSE, under = TRUE, digits = 4)

# Prediction on the test set
pred_y <- as.numeric(predict(best_tree, newdata = Boston_ts))
err_all <- (pred_y - Boston_ts$medv)^2

plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')

sqrt(mean(err_1D)); sqrt(mean(err_2D)); sqrt(mean(err_all))
[1] 6.022948
[1] 5.147602
[1] 4.743209


Ensemble methods

Bagging and Random forests

# Let's first try to show how Bagging works 
xgrid <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 1000)
B <- 1000
pred_y <- array(NA, dim = c(B, length(xgrid)))
for (b in 1:B){
  Boston_b <- Boston[sample(1:n_train, n_train, TRUE),]
  
  deep_tree <- rpart(medv ~ lstat, method = "anova", data = Boston_b,
                     control = rpart.control(minsplit = 5, cp = .0025))
  
  pred_y[b,] <- as.numeric(predict(deep_tree, newdata = list(lstat = xgrid)))
}

plot(Boston$lstat, Boston$medv, pch = 16, xlab = 'lstat', ylab = 'medv')
lines(xgrid, pred_y[1,], lwd = 2, col = 'red')
lines(xgrid, pred_y[2,], lwd = 2, col = 'blue')
lines(xgrid, pred_y[3,], lwd = 2, col = 'green')

plot(Boston$lstat, Boston$medv, pch = 16, xlab = 'lstat', ylab = 'medv')
lines(xgrid, colMeans(pred_y), lwd = 2, col = 'red')
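
Averaging the B bootstrap trees reduces variance: the individual trees fluctuate from sample to sample, while their mean is much more stable. A small sketch of the pointwise spread across the bootstrap fits (my addition):

# Pointwise standard deviation of the individual bootstrap-tree predictions
sd_trees <- apply(pred_y, 2, sd)
plot(xgrid, sd_trees, type = 'l', lwd = 2,
     xlab = 'lstat', ylab = 'sd across bootstrap trees')
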

# Let's now use the randomForest package
library(randomForest)
rffit <- randomForest(medv ~ ., data = Boston, ntree = 200)
plot(rffit, main = 'OOB Error')

# Can we achieve a lower OOB error with more trees?

rffit <- randomForest(medv ~ ., data = Boston, ntree = 1000)
plot(rffit, main = 'OOB Error')

pred_y <- predict(rffit, newdata = Boston_ts)


plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')

err_RF <- (pred_y - Boston_ts$medv)^2
sqrt(mean(err_1D)); sqrt(mean(err_2D)); sqrt(mean(err_all)); sqrt(mean(err_RF))
[1] 6.022948
[1] 5.147602
[1] 4.743209
[1] 3.482059
# We can change the number of variables to be considered at each split
p <- ncol(Boston) - 1

rffit1 <- randomForest(medv ~ ., data = Boston, ntree = 500, mtry = p)
plot(rffit1, main = 'OOB Error', ylim = c(10, 25))

rffit2 <- randomForest(medv ~ ., data = Boston, ntree = 500, mtry = floor(p/2))
plot(rffit2, add = T, col = 'red')

rffit3 <- randomForest(medv ~ ., data = Boston, ntree = 500, mtry = floor(sqrt(p)))
plot(rffit3, add = T, col = 'blue')

legend('topright', legend = c('m = p', 'm = p/2', 'm = sqrt(p)'), col = c('black', 'red', 'blue'), lwd = 2)

plot(importance(rffit3), xlab = 'variables', ylab = 'importance', 
     xaxt = 'n', type = 'h', lwd = 3)
axis(1, at = 1:length(importance(rffit3)), labels = rownames(importance(rffit3)))
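
Out of curiosity, we can also compare the three mtry choices on the held-out test set (a quick sketch; the err_RF comparison uses the 1000-tree fit with the default mtry):

# Test RMSE for each choice of mtry (sketch)
sapply(list('m = p' = rffit1, 'm = p/2' = rffit2, 'm = sqrt(p)' = rffit3),
       function(f) sqrt(mean((predict(f, newdata = Boston_ts) - Boston_ts$medv)^2)))
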

Boosting

set.seed(123)
n <- 500
lambda <- 0.5
X <- runif(n, 0, 10)
Y <- rnorm(n, sin(2*X), 0.75)
xgrid <- seq(min(X), max(X), length.out = 1000)

shallow_tree <- rpart(Y ~ X, method = "anova", control = list(maxdepth = 4))

pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))


plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')

res <- Y - lambda * pred_y_train

shallow_tree <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))


pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))


plot(X, res, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')

res <- res - lambda * pred_y_train

shallow_tree <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))


pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))


plot(X, res, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')

res <- res - lambda * pred_y_train

shallow_tree <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))


pred_y <- as.numeric(predict(shallow_tree, newdata = list(X = xgrid)))
pred_y_train <- as.numeric(predict(shallow_tree, newdata = list(X = X)))


plot(X, res, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, pred_y, lwd = 2, col = 'red')
lines(xgrid, lambda * pred_y, lwd = 2, col = 'blue')
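
The four blocks above repeat the same update: fit a shallow tree to the current residuals, add lambda times its fit to the running ensemble, and subtract lambda times its fitted values from the residuals. The same procedure written as a loop (a sketch; M and boost_fit are my names):

# Boosting for squared-error loss, written as a loop (sketch)
M <- 100                                   # number of boosting iterations (assumed)
res <- Y                                   # start from the raw response
boost_fit <- rep(0, length(xgrid))         # accumulated prediction on the grid
for (m in 1:M) {
  tree_m <- rpart(res ~ X, method = "anova", control = list(maxdepth = 4))
  boost_fit <- boost_fit + lambda * as.numeric(predict(tree_m, newdata = list(X = xgrid)))
  res <- res - lambda * as.numeric(predict(tree_m, newdata = list(X = X)))
}
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, boost_fit, lwd = 2, col = 'red')
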

library(xgboost)

Boston_X <- Boston
Boston_X$medv <- NULL

Boston_xgb <- xgb.DMatrix(as.matrix(Boston_X), label = Boston$medv)

fit1_xgb <- xgb.cv(
  data = Boston_xgb,
  nrounds = 5000,
  nfold = 5,
  objective = "reg:squarederror",  # for regression models
  verbose = 0, # silent,
  early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
)


ggplot(fit1_xgb$evaluation_log) +
  geom_line(aes(iter, train_rmse_mean), color = "red") +
  geom_line(aes(iter, test_rmse_mean), color = "blue")
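
The evaluation log also tells us where the CV test error bottoms out, which is the same quantity used in the grid search below (sketch):

# Number of rounds minimizing the CV test RMSE, and that RMSE (sketch)
best_iter <- which.min(fit1_xgb$evaluation_log$test_rmse_mean)
c(best_iter, fit1_xgb$evaluation_log$test_rmse_mean[best_iter])
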

fit2_xgb <- xgb.cv(
  params = list(eta = 0.01), 
  data = Boston_xgb,
  nrounds = 5000,
  nfold = 5,
  objective = "reg:squarederror",  # for regression models
  verbose = 0, # silent,
  early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
)

ggplot(fit2_xgb$evaluation_log) +
  geom_line(aes(iter, train_rmse_mean), color = "red") +
  geom_line(aes(iter, test_rmse_mean), color = "blue")

Boston_xgb <- xgb.DMatrix(as.matrix(Boston_X), label = Boston$medv)


# create hyperparameter grid
hyper_grid <- expand.grid(
  eta = c(.01, .05, .15, 0.3),
  max_depth = c(1, 3, 5, 7),
  optimal_trees = 0,               # a place to dump results
  min_RMSE = 0                     # a place to dump results
)


# grid search
for(i in 1:nrow(hyper_grid)) {

  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )

  # reproducibility
  set.seed(123)

  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = Boston_xgb,
    nrounds = 5000,
    nfold = 5,
    objective = "reg:squarederror",  # for regression models
    verbose = 0,               # silent,
    early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
  )

  # add minimum CV test error and the corresponding number of trees to the grid
  hyper_grid$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  hyper_grid$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
}

hyper_grid[order(hyper_grid$min_RMSE, decreasing = F),]
    eta max_depth optimal_trees min_RMSE
12 0.30         5            36 3.726241
6  0.05         3           208 3.777685
8  0.30         3            35 3.794773
5  0.01         3           736 3.802070
7  0.15         3            61 3.810126
10 0.05         5           204 3.829675
15 0.15         7            78 3.862303
11 0.15         5            57 3.874303
16 0.30         7            17 3.875736
13 0.01         7           755 3.916442
9  0.01         5           613 3.917884
14 0.05         7           125 3.984433
3  0.15         1            97 4.077576
2  0.05         1           305 4.083208
4  0.30         1            53 4.100558
1  0.01         1          1012 4.142000
Boston_xgb <- xgb.DMatrix(as.matrix(Boston_X), label = Boston$medv)

# parameter list
params <- list(
  eta = hyper_grid$eta[which.min(hyper_grid$min_RMSE)],
  max_depth = hyper_grid$max_depth[which.min(hyper_grid$min_RMSE)]
)

# train final model
xgb.fit.final <- xgboost(
  params = params,
  data = Boston_xgb,
  nrounds = hyper_grid$optimal_trees[which.min(hyper_grid$min_RMSE)],
  objective = "reg:squarederror",
  verbose = 0
)


# create importance matrix
importance_matrix <- xgb.importance(model = xgb.fit.final)

# variable importance plot
xgb.plot.importance(importance_matrix, top_n = 10, measure = "Gain")

Boston_ts_X <- Boston_ts
Boston_ts_X$medv <- NULL

Boston_test_xgb <- xgb.DMatrix(as.matrix(Boston_ts_X), label = Boston_ts$medv)
pred_y <- predict(xgb.fit.final, Boston_test_xgb)


plot(Boston_ts$medv, pred_y, pch = 16, xlab = 'observed', ylab = 'predicted')
abline(0, 1, lwd = 2, col = 'red')

err_boost <- (pred_y - Boston_ts$medv)^2
sqrt(mean(err_1D)); sqrt(mean(err_2D)); sqrt(mean(err_all)); sqrt(mean(err_RF)); sqrt(mean(err_boost))
[1] 6.022948
[1] 5.147602
[1] 4.743209
[1] 3.482059
[1] 2.727177